{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "opened-albania",
   "metadata": {},
   "source": [
    "# S-SAT demo using Landsat 8 dataset and Sentinel 2 metadata\n",
    "# S-SAT have spatial join and temporal join at the same time\n",
    "## This demo could be used to find Spatio-temporal intersection between the two datasets and the potential area of interest"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "accompanied-procurement",
   "metadata": {},
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "several-above",
   "metadata": {},
   "outputs": [],
   "source": [
    "### test temporal index\n",
    "## the basic idea is to generate a temporal id, and classify data into categories according to time"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "included-saskatchewan",
   "metadata": {},
   "outputs": [],
   "source": [
    "# date in the format of \"yyy-MM-dd\", results include both start date and end date\n",
    "\n",
    "# The hour delay variable represents how many hours apart the two datasets are considered to be temporally intersected\n",
    "# E.g. if hour_delay = 6, for 2020-06-26 12:00:00, timestamps between 2020-06-26 06:00:00 and 2020-06-26 18:00:00 are considered to be temporally intersected\n",
    "start_date = '2018-01-01'\n",
    "end_date = '2018-03-31'\n",
    "duration_month = 3 # for export\n",
    "time_index_hour_length = 24\n",
    "hour_delay = 6"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "russian-shuttle",
   "metadata": {},
   "source": [
    "## please input the potential area of interest, we have \"Beaufort_Sea\" and \"Wandel_Sea\" in in this demo\n",
    "## if not using AOIs, could leave blank"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "dirty-figure",
   "metadata": {},
   "outputs": [],
   "source": [
    "Paoi = 'Wandel_Sea'"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "international-motorcycle",
   "metadata": {},
   "source": [
    "## First, setup the Spark and Sedona environment"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "heated-veteran",
   "metadata": {},
   "outputs": [],
   "source": [
    "import findspark\n",
    "#findspark.init() "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "civil-cause",
   "metadata": {},
   "outputs": [],
   "source": [
    "SPARK_HOME='/opt/cloudera/parcels/CDH/lib/spark'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "ordinary-relationship",
   "metadata": {},
   "outputs": [],
   "source": [
    "findspark.init(SPARK_HOME)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "white-wales",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/zhangp/.conda/envs/py37_environment/lib/python3.7/site-packages/geopandas/_compat.py:58: UserWarning: The installed version of PyGEOS is too old (0.6 installed, 0.8 required), and thus GeoPandas will not use PyGEOS.\n",
      "  UserWarning,\n"
     ]
    }
   ],
   "source": [
    "import json\n",
    "import os\n",
    "import codecs\n",
    "import subprocess\n",
    "#from hdfs import InsecureClient\n",
    "import numpy as np\n",
    "#from pyspark import SparkContext\n",
    "from pyspark import SQLContext\n",
    "from pyspark.sql import Row\n",
    "from pyspark.sql import functions as F\n",
    "from pyspark.sql.types import *\n",
    "import rtree\n",
    "from pyspark.sql import Window\n",
    "#import igraph\n",
    "#from igraph import Graph\n",
    "import geofeather"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "entire-beverage",
   "metadata": {},
   "outputs": [],
   "source": [
    "from pyspark.sql import SparkSession\n",
    "from pyspark import StorageLevel\n",
    "import geopandas as gpd\n",
    "import pandas as pd\n",
    "from pyspark.sql.types import StructType\n",
    "from pyspark.sql.types import StructField\n",
    "from pyspark.sql.types import StringType\n",
    "from pyspark.sql.types import LongType\n",
    "from shapely.geometry import Point\n",
    "from shapely.geometry import Polygon\n",
    "\n",
    "from sedona.register import SedonaRegistrator\n",
    "from sedona.core.SpatialRDD import SpatialRDD\n",
    "from sedona.core.SpatialRDD import PointRDD\n",
    "from sedona.core.SpatialRDD import PolygonRDD\n",
    "from sedona.core.SpatialRDD import LineStringRDD\n",
    "from sedona.core.enums import FileDataSplitter\n",
    "from sedona.utils.adapter import Adapter\n",
    "from sedona.core.spatialOperator import KNNQuery\n",
    "from sedona.core.spatialOperator import JoinQuery\n",
    "from sedona.core.spatialOperator import JoinQueryRaw\n",
    "from sedona.core.spatialOperator import RangeQuery\n",
    "from sedona.core.spatialOperator import RangeQueryRaw\n",
    "from sedona.core.formatMapper.shapefileParser import ShapefileReader\n",
    "from sedona.core.formatMapper import WkbReader\n",
    "from sedona.core.formatMapper import WktReader\n",
    "from sedona.core.formatMapper import GeoJsonReader\n",
    "from sedona.sql.types import GeometryType\n",
    "from sedona.core.enums import GridType\n",
    "from sedona.core.SpatialRDD import RectangleRDD\n",
    "from sedona.core.enums import IndexType\n",
    "from sedona.core.geom.envelope import Envelope\n",
    "from sedona.utils import SedonaKryoRegistrator, KryoSerializer"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "blocked-mistress",
   "metadata": {},
   "outputs": [],
   "source": [
    "os.environ['PYSPARK_PYTHON'] = \"./environment/bin/python\"\n",
    "os.environ['YARN_CONF_DIR'] = \"/opt/cloudera/parcels/CDH/lib/spark/conf/yarn-conf\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "possible-glory",
   "metadata": {},
   "outputs": [],
   "source": [
    "spark = SparkSession \\\n",
    ".builder \\\n",
    ".appName(\"test_test_1\") \\\n",
    ".master('yarn') \\\n",
    ".config(\"spark.serializer\", KryoSerializer.getName) \\\n",
    ".config(\"spark.kryo.registrator\", SedonaKryoRegistrator.getName) \\\n",
    ".config('spark.jars','sedona-core-2.4_2.11-1.0.0-incubating.jar,sedona-sql-2.4_2.11-1.0.0-incubating.jar,sedona-python-adapter-2.4_2.11-1.0.0-incubating.jar,sedona-viz-2.4_2.11-1.0.0-incubating.jar,geotools-wrapper-geotools-24.0.jar') \\\n",
    ".config('spark.executor.memory', '20g') \\\n",
    ".config('spark.driver.memory', '10g') \\\n",
    ".config('spark.sql.shuffle.partitions', 6144) \\\n",
    ".config('spark.executor.instances', '24') \\\n",
    ".config('spark.executor.cores', '5') \\\n",
    ".config('spark.rpc.message.maxSize', '1024') \\\n",
    ".config('spark.yarn.dist.archives', 'environment.tar.gz#environment') \\\n",
    ".getOrCreate()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "emerging-developer",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "SedonaRegistrator.registerAll(spark)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "other-natural",
   "metadata": {},
   "source": [
    "## Second, read datasets\n",
    "### Read ICESat-2 data, Spark have different types of input methods, here we read from Hadoop file system, as our Spark 2.4 is built based on Hadoop Yarn"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "fuzzy-shower",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "empirical-fantasy",
   "metadata": {},
   "outputs": [],
   "source": [
    "l8_df_raw = spark.read.option(\"header\",True).option('inferSchema', True).options(delimiter='|').csv(\"meta_l8_2018_cloud_sep.csv\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "uniform-diagram",
   "metadata": {},
   "outputs": [],
   "source": [
    "s2_df_raw = spark.read.option(\"header\",True).option('inferSchema', True).options(delimiter='|').csv(\"meta_s2_2018_cloud_sep.csv\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "noted-mount",
   "metadata": {},
   "outputs": [],
   "source": [
    "# pr_df_raw = spark.read.option(\"header\",True).option('inferSchema', True).csv(\"meta_pr_2016.csv\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "appointed-stretch",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "155924"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "l8_df_raw.count()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "cathedral-paper",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "461000"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "s2_df_raw.count()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "careful-governor",
   "metadata": {},
   "outputs": [],
   "source": [
    "# pr_df_raw.count()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "id": "planned-picking",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+---+--------------------+----------+--------------------+--------------------+\n",
      "|_c0|        system:index|cloudcover|           timestamp|                .geo|\n",
      "+---+--------------------+----------+--------------------+--------------------+\n",
      "|  0|LC08_001004_20180706|     58.52|2018-07-06 14:07:...|\"{\"\"type\"\":\"\"Line...|\n",
      "|  1|LC08_001004_20180722|     85.11|2018-07-22 14:07:...|\"{\"\"type\"\":\"\"Line...|\n",
      "|  2|LC08_001004_20180807|      0.08|2018-08-07 14:07:...|\"{\"\"type\"\":\"\"Line...|\n",
      "|  3|LC08_001004_20180908|      1.85|2018-09-08 14:07:...|\"{\"\"type\"\":\"\"Line...|\n",
      "|  4|LC08_001005_20180417|      4.23|2018-04-17 14:08:...|\"{\"\"type\"\":\"\"Line...|\n",
      "|  5|LC08_001005_20180706|     41.22|2018-07-06 14:07:...|\"{\"\"type\"\":\"\"Line...|\n",
      "|  6|LC08_001005_20180722|     42.02|2018-07-22 14:07:...|\"{\"\"type\"\":\"\"Line...|\n",
      "|  7|LC08_001005_20180807|       0.1|2018-08-07 14:08:...|\"{\"\"type\"\":\"\"Line...|\n",
      "|  8|LC08_001005_20180823|     66.75|2018-08-23 14:08:...|\"{\"\"type\"\":\"\"Line...|\n",
      "|  9|LC08_001005_20180908|     22.47|2018-09-08 14:08:...|\"{\"\"type\"\":\"\"Line...|\n",
      "| 10|LC08_001006_20180401|      3.01|2018-04-01 14:08:...|\"{\"\"type\"\":\"\"Line...|\n",
      "| 11|LC08_001006_20180417|      4.27|2018-04-17 14:08:...|\"{\"\"type\"\":\"\"Line...|\n",
      "| 12|LC08_001006_20180519|     36.96|2018-05-19 14:08:...|\"{\"\"type\"\":\"\"Line...|\n",
      "| 13|LC08_001006_20180604|      0.07|2018-06-04 14:07:...|\"{\"\"type\"\":\"\"Line...|\n",
      "| 14|LC08_001006_20180620|      1.37|2018-06-20 14:08:...|\"{\"\"type\"\":\"\"Line...|\n",
      "| 15|LC08_001006_20180706|      13.4|2018-07-06 14:08:...|\"{\"\"type\"\":\"\"Line...|\n",
      "| 16|LC08_001006_20180722|     43.27|2018-07-22 14:08:...|\"{\"\"type\"\":\"\"Line...|\n",
      "| 17|LC08_001006_20180807|      0.23|2018-08-07 14:08:...|\"{\"\"type\"\":\"\"Line...|\n",
      "| 18|LC08_001006_20180823|     52.62|2018-08-23 14:08:...|\"{\"\"type\"\":\"\"Line...|\n",
      "| 19|LC08_001006_20180908|     40.37|2018-09-08 14:08:...|\"{\"\"type\"\":\"\"Line...|\n",
      "+---+--------------------+----------+--------------------+--------------------+\n",
      "only showing top 20 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "l8_df_raw.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "id": "fatty-philadelphia",
   "metadata": {},
   "outputs": [],
   "source": [
    "# l8_df_raw = l8_df_raw.filter(F.col('cloudcover') <= 1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "id": "vital-transcription",
   "metadata": {},
   "outputs": [],
   "source": [
    "# l8_df_raw.filter(F.col('cloudcover') == 0).count()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "id": "molecular-liability",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "155924"
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "l8_df_raw.count()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "id": "freelance-aircraft",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+---+--------------------+----------+--------------------+--------------------+\n",
      "|_c0|        system:index|cloudcover|           timestamp|                .geo|\n",
      "+---+--------------------+----------+--------------------+--------------------+\n",
      "|  0|20171226T094359_2...|     9.219|2018-02-02 11:37:...|\"{\"\"type\"\":\"\"Line...|\n",
      "|  1|20180101T082329_2...|    98.622|2018-01-01 08:23:...|\"{\"\"type\"\":\"\"Line...|\n",
      "|  2|20180101T082329_2...|     100.0|2018-01-01 08:23:...|\"{\"\"type\"\":\"\"Line...|\n",
      "|  3|20180101T082329_2...|   47.2689|2018-01-01 08:23:...|\"{\"\"type\"\":\"\"Line...|\n",
      "|  4|20180101T082329_2...|   51.0769|2018-01-01 08:23:...|\"{\"\"type\"\":\"\"Line...|\n",
      "|  5|20180101T082329_2...|   76.9772|2018-01-01 08:23:...|\"{\"\"type\"\":\"\"Line...|\n",
      "|  6|20180101T082329_2...|   77.5421|2018-01-01 08:23:...|\"{\"\"type\"\":\"\"Line...|\n",
      "|  7|20180101T082329_2...|   85.8672|2018-01-01 08:23:...|\"{\"\"type\"\":\"\"Line...|\n",
      "|  8|20180101T082329_2...|   37.5433|2018-01-01 08:23:...|\"{\"\"type\"\":\"\"Line...|\n",
      "|  9|20180101T082329_2...|   67.3636|2018-01-01 08:23:...|\"{\"\"type\"\":\"\"Line...|\n",
      "| 10|20180101T082329_2...|   85.0339|2018-01-01 08:23:...|\"{\"\"type\"\":\"\"Line...|\n",
      "| 11|20180101T082329_2...|   17.2088|2018-01-01 08:23:...|\"{\"\"type\"\":\"\"Line...|\n",
      "| 12|20180101T082329_2...|   16.1119|2018-01-01 08:23:...|\"{\"\"type\"\":\"\"Line...|\n",
      "| 13|20180101T082329_2...|    3.9833|2018-01-01 08:23:...|\"{\"\"type\"\":\"\"Line...|\n",
      "| 14|20180101T082329_2...|   73.4103|2018-01-01 08:23:...|\"{\"\"type\"\":\"\"Line...|\n",
      "| 15|20180101T082329_2...|   68.6584|2018-01-01 08:23:...|\"{\"\"type\"\":\"\"Line...|\n",
      "| 16|20180101T082329_2...|   51.4231|2018-01-01 08:23:...|\"{\"\"type\"\":\"\"Line...|\n",
      "| 17|20180101T082329_2...|   28.1423|2018-01-01 08:23:...|\"{\"\"type\"\":\"\"Line...|\n",
      "| 18|20180101T082329_2...|   23.1729|2018-01-01 08:23:...|\"{\"\"type\"\":\"\"Line...|\n",
      "| 19|20180101T082329_2...|    6.5445|2018-01-01 08:23:...|\"{\"\"type\"\":\"\"Line...|\n",
      "+---+--------------------+----------+--------------------+--------------------+\n",
      "only showing top 20 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "s2_df_raw.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "id": "utility-director",
   "metadata": {},
   "outputs": [],
   "source": [
    "# s2_df_raw = s2_df_raw.filter(F.col('cloudcover') <= 1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "id": "swedish-hunger",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "461000"
      ]
     },
     "execution_count": 24,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "s2_df_raw.count()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "formal-therapist",
   "metadata": {},
   "source": [
    "### generate geo columns for all columns, using WKT"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "id": "quick-harvard",
   "metadata": {},
   "outputs": [],
   "source": [
    "s2_df_raw = s2_df_raw.withColumnRenamed('.geo', 'footprint')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "id": "minimal-midnight",
   "metadata": {},
   "outputs": [],
   "source": [
    "import re\n",
    "def generate_WKT(GEE_coo_str):\n",
    "    if 'LinearRing' not in GEE_coo_str:\n",
    "        print('error geo type')\n",
    "        return 'unknown geo'\n",
    "    if re.search('(\\[\\[.+\\]\\])', GEE_coo_str):\n",
    "        gee_coo0 = re.search('(\\[\\[.+\\]\\])', GEE_coo_str).group(1)\n",
    "        test_com = re.compile('(\\[.+?)(\\,)(.+?\\])')\n",
    "        gee_coos_1 = test_com.sub(r'\\1 \\3', gee_coo0)\n",
    "        gee_coos_2 = gee_coos_1.replace('[', '').replace(']', '').replace(',', ', ')\n",
    "        return 'POLYGON ((' + gee_coos_2 + '))'\n",
    "    else:\n",
    "        return 'unknown format'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "id": "decent-switch",
   "metadata": {},
   "outputs": [],
   "source": [
    "generate_WKT_udf = F.udf(generate_WKT)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "id": "opponent-harvest",
   "metadata": {},
   "outputs": [],
   "source": [
    "l8_df_raw = l8_df_raw.withColumnRenamed('.geo', 'footprint')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "id": "therapeutic-hybrid",
   "metadata": {},
   "outputs": [],
   "source": [
    "l8_df_wkt = l8_df_raw.withColumn('WKT_geo', generate_WKT_udf(F.col('footprint')))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "id": "diagnostic-caution",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+---+--------------------+----------+--------------------+--------------------+--------------------+\n",
      "|_c0|        system:index|cloudcover|           timestamp|           footprint|             WKT_geo|\n",
      "+---+--------------------+----------+--------------------+--------------------+--------------------+\n",
      "|  0|LC08_001004_20180706|     58.52|2018-07-06 14:07:...|\"{\"\"type\"\":\"\"Line...|POLYGON ((-10.924...|\n",
      "|  1|LC08_001004_20180722|     85.11|2018-07-22 14:07:...|\"{\"\"type\"\":\"\"Line...|POLYGON ((-16.038...|\n",
      "|  2|LC08_001004_20180807|      0.08|2018-08-07 14:07:...|\"{\"\"type\"\":\"\"Line...|POLYGON ((-12.596...|\n",
      "|  3|LC08_001004_20180908|      1.85|2018-09-08 14:07:...|\"{\"\"type\"\":\"\"Line...|POLYGON ((-13.429...|\n",
      "|  4|LC08_001005_20180417|      4.23|2018-04-17 14:08:...|\"{\"\"type\"\":\"\"Line...|POLYGON ((-19.204...|\n",
      "|  5|LC08_001005_20180706|     41.22|2018-07-06 14:07:...|\"{\"\"type\"\":\"\"Line...|POLYGON ((-15.823...|\n",
      "|  6|LC08_001005_20180722|     42.02|2018-07-22 14:07:...|\"{\"\"type\"\":\"\"Line...|POLYGON ((-20.743...|\n",
      "|  7|LC08_001005_20180807|       0.1|2018-08-07 14:08:...|\"{\"\"type\"\":\"\"Line...|POLYGON ((-25.727...|\n",
      "|  8|LC08_001005_20180823|     66.75|2018-08-23 14:08:...|\"{\"\"type\"\":\"\"Line...|POLYGON ((-19.249...|\n",
      "|  9|LC08_001005_20180908|     22.47|2018-09-08 14:08:...|\"{\"\"type\"\":\"\"Line...|POLYGON ((-19.547...|\n",
      "| 10|LC08_001006_20180401|      3.01|2018-04-01 14:08:...|\"{\"\"type\"\":\"\"Line...|POLYGON ((-19.492...|\n",
      "| 11|LC08_001006_20180417|      4.27|2018-04-17 14:08:...|\"{\"\"type\"\":\"\"Line...|POLYGON ((-21.456...|\n",
      "| 12|LC08_001006_20180519|     36.96|2018-05-19 14:08:...|\"{\"\"type\"\":\"\"Line...|POLYGON ((-29.341...|\n",
      "| 13|LC08_001006_20180604|      0.07|2018-06-04 14:07:...|\"{\"\"type\"\":\"\"Line...|POLYGON ((-29.095...|\n",
      "| 14|LC08_001006_20180620|      1.37|2018-06-20 14:08:...|\"{\"\"type\"\":\"\"Line...|POLYGON ((-22.339...|\n",
      "| 15|LC08_001006_20180706|      13.4|2018-07-06 14:08:...|\"{\"\"type\"\":\"\"Line...|POLYGON ((-29.103...|\n",
      "| 16|LC08_001006_20180722|     43.27|2018-07-22 14:08:...|\"{\"\"type\"\":\"\"Line...|POLYGON ((-29.090...|\n",
      "| 17|LC08_001006_20180807|      0.23|2018-08-07 14:08:...|\"{\"\"type\"\":\"\"Line...|POLYGON ((-19.801...|\n",
      "| 18|LC08_001006_20180823|     52.62|2018-08-23 14:08:...|\"{\"\"type\"\":\"\"Line...|POLYGON ((-24.993...|\n",
      "| 19|LC08_001006_20180908|     40.37|2018-09-08 14:08:...|\"{\"\"type\"\":\"\"Line...|POLYGON ((-22.364...|\n",
      "+---+--------------------+----------+--------------------+--------------------+--------------------+\n",
      "only showing top 20 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "l8_df_wkt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "id": "historical-ocean",
   "metadata": {},
   "outputs": [],
   "source": [
    "s2_df_wkt = s2_df_raw.withColumn('WKT_geo', generate_WKT_udf(F.col('footprint')))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "id": "brief-socket",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+---+--------------------+----------+--------------------+--------------------+--------------------+\n",
      "|_c0|        system:index|cloudcover|           timestamp|           footprint|             WKT_geo|\n",
      "+---+--------------------+----------+--------------------+--------------------+--------------------+\n",
      "|  0|20171226T094359_2...|     9.219|2018-02-02 11:37:...|\"{\"\"type\"\":\"\"Line...|POLYGON ((-6.5921...|\n",
      "|  1|20180101T082329_2...|    98.622|2018-01-01 08:23:...|\"{\"\"type\"\":\"\"Line...|POLYGON ((33.1015...|\n",
      "|  2|20180101T082329_2...|     100.0|2018-01-01 08:23:...|\"{\"\"type\"\":\"\"Line...|POLYGON ((34.1301...|\n",
      "|  3|20180101T082329_2...|   47.2689|2018-01-01 08:23:...|\"{\"\"type\"\":\"\"Line...|POLYGON ((34.1465...|\n",
      "|  4|20180101T082329_2...|   51.0769|2018-01-01 08:23:...|\"{\"\"type\"\":\"\"Line...|POLYGON ((34.1574...|\n",
      "|  5|20180101T082329_2...|   76.9772|2018-01-01 08:23:...|\"{\"\"type\"\":\"\"Line...|POLYGON ((35.1594...|\n",
      "|  6|20180101T082329_2...|   77.5421|2018-01-01 08:23:...|\"{\"\"type\"\":\"\"Line...|POLYGON ((35.1689...|\n",
      "|  7|20180101T082329_2...|   85.8672|2018-01-01 08:23:...|\"{\"\"type\"\":\"\"Line...|POLYGON ((34.0543...|\n",
      "|  8|20180101T082329_2...|   37.5433|2018-01-01 08:23:...|\"{\"\"type\"\":\"\"Line...|POLYGON ((35.0638...|\n",
      "|  9|20180101T082329_2...|   67.3636|2018-01-01 08:23:...|\"{\"\"type\"\":\"\"Line...|POLYGON ((36.0089...|\n",
      "| 10|20180101T082329_2...|   85.0339|2018-01-01 08:23:...|\"{\"\"type\"\":\"\"Line...|POLYGON ((35.1079...|\n",
      "| 11|20180101T082329_2...|   17.2088|2018-01-01 08:23:...|\"{\"\"type\"\":\"\"Line...|POLYGON ((33.3628...|\n",
      "| 12|20180101T082329_2...|   16.1119|2018-01-01 08:23:...|\"{\"\"type\"\":\"\"Line...|POLYGON ((34.1677...|\n",
      "| 13|20180101T082329_2...|    3.9833|2018-01-01 08:23:...|\"{\"\"type\"\":\"\"Line...|POLYGON ((34.1008...|\n",
      "| 14|20180101T082329_2...|   73.4103|2018-01-01 08:23:...|\"{\"\"type\"\":\"\"Line...|POLYGON ((34.0647...|\n",
      "| 15|20180101T082329_2...|   68.6584|2018-01-01 08:23:...|\"{\"\"type\"\":\"\"Line...|POLYGON ((34.0756...|\n",
      "| 16|20180101T082329_2...|   51.4231|2018-01-01 08:23:...|\"{\"\"type\"\":\"\"Line...|POLYGON ((34.0746...|\n",
      "| 17|20180101T082329_2...|   28.1423|2018-01-01 08:23:...|\"{\"\"type\"\":\"\"Line...|POLYGON ((35.3049...|\n",
      "| 18|20180101T082329_2...|   23.1729|2018-01-01 08:23:...|\"{\"\"type\"\":\"\"Line...|POLYGON ((34.6001...|\n",
      "| 19|20180101T082329_2...|    6.5445|2018-01-01 08:23:...|\"{\"\"type\"\":\"\"Line...|POLYGON ((35.3583...|\n",
      "+---+--------------------+----------+--------------------+--------------------+--------------------+\n",
      "only showing top 20 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "s2_df_wkt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "id": "confused-hungarian",
   "metadata": {},
   "outputs": [],
   "source": [
    "# s2_df_raw_pd = s2_df_raw.limit(5).toPandas()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "id": "efficient-tsunami",
   "metadata": {},
   "outputs": [],
   "source": [
    "# s2_df_raw_pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "id": "jewish-block",
   "metadata": {},
   "outputs": [],
   "source": [
    "# pr_df_wkt = pr_df_raw.withColumn('WKT_geo', generate_WKT_udf(F.col('footprint')))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "confident-exhibit",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "id": "western-sympathy",
   "metadata": {},
   "outputs": [],
   "source": [
    "s2_df_wkt = s2_df_wkt.withColumnRenamed('system:index', 's2_index')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "id": "digital-validation",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "root\n",
      " |-- _c0: integer (nullable = true)\n",
      " |-- s2_index: string (nullable = true)\n",
      " |-- cloudcover: double (nullable = true)\n",
      " |-- timestamp: timestamp (nullable = true)\n",
      " |-- footprint: string (nullable = true)\n",
      " |-- WKT_geo: string (nullable = true)\n",
      "\n"
     ]
    }
   ],
   "source": [
    "s2_df_wkt.printSchema()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "id": "forward-seller",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "root\n",
      " |-- _c0: integer (nullable = true)\n",
      " |-- system:index: string (nullable = true)\n",
      " |-- cloudcover: double (nullable = true)\n",
      " |-- timestamp: timestamp (nullable = true)\n",
      " |-- footprint: string (nullable = true)\n",
      " |-- WKT_geo: string (nullable = true)\n",
      "\n"
     ]
    }
   ],
   "source": [
    "l8_df_wkt.printSchema()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "id": "identical-pierce",
   "metadata": {},
   "outputs": [],
   "source": [
    "l8_df_wkt = l8_df_wkt.withColumnRenamed('system:index', 'l8_index')"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "thrown-brook",
   "metadata": {},
   "source": [
    "### transfer to Sedona df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "id": "adapted-rogers",
   "metadata": {},
   "outputs": [],
   "source": [
    "l8_df_wkt.createOrReplaceTempView(\"l8_df_wkt\")\n",
    "l8_df_sedona = spark.sql(\"select ST_GeomFromWKT(WKT_geo) as l8_geometry, l8_index as l8_img_id, timestamp as l8_timestep from l8_df_wkt\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "id": "smart-principal",
   "metadata": {},
   "outputs": [],
   "source": [
    "# calculate the area for the l8\n",
    "\n",
    "l8_df_sedona.createOrReplaceTempView(\"l8_df_sedona\")\n",
    "l8_df_sedona = spark.sql(\"select ST_Area(l8_geometry) as l8_area, * from l8_df_sedona\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "id": "quiet-parking",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+------------------+--------------------+--------------------+--------------------+\n",
      "|           l8_area|         l8_geometry|           l8_img_id|         l8_timestep|\n",
      "+------------------+--------------------+--------------------+--------------------+\n",
      "|14.124304956531168|POLYGON ((-10.924...|LC08_001004_20180706|2018-07-06 14:07:...|\n",
      "|  14.1368958808218|POLYGON ((-16.038...|LC08_001004_20180722|2018-07-22 14:07:...|\n",
      "| 14.13397280878489|POLYGON ((-12.596...|LC08_001004_20180807|2018-08-07 14:07:...|\n",
      "| 14.21783504836524|POLYGON ((-13.429...|LC08_001004_20180908|2018-09-08 14:07:...|\n",
      "|13.013235111990365|POLYGON ((-19.204...|LC08_001005_20180417|2018-04-17 14:08:...|\n",
      "|13.016815047842258|POLYGON ((-15.823...|LC08_001005_20180706|2018-07-06 14:07:...|\n",
      "| 13.04422889786937|POLYGON ((-20.743...|LC08_001005_20180722|2018-07-22 14:07:...|\n",
      "|13.025145367910945|POLYGON ((-25.727...|LC08_001005_20180807|2018-08-07 14:08:...|\n",
      "|13.088991981998086|POLYGON ((-19.249...|LC08_001005_20180823|2018-08-23 14:08:...|\n",
      "|13.036544425947993|POLYGON ((-19.547...|LC08_001005_20180908|2018-09-08 14:08:...|\n",
      "|11.996375308726677|POLYGON ((-19.492...|LC08_001006_20180401|2018-04-01 14:08:...|\n",
      "| 12.04368416424424|POLYGON ((-21.456...|LC08_001006_20180417|2018-04-17 14:08:...|\n",
      "|11.994548598119355|POLYGON ((-29.341...|LC08_001006_20180519|2018-05-19 14:08:...|\n",
      "|11.907405621268417|POLYGON ((-29.095...|LC08_001006_20180604|2018-06-04 14:07:...|\n",
      "|12.011768123945448|POLYGON ((-22.339...|LC08_001006_20180620|2018-06-20 14:08:...|\n",
      "|11.997177863781467|POLYGON ((-29.103...|LC08_001006_20180706|2018-07-06 14:08:...|\n",
      "|12.036923004713366|POLYGON ((-29.090...|LC08_001006_20180722|2018-07-22 14:08:...|\n",
      "|11.964717576376385|POLYGON ((-19.801...|LC08_001006_20180807|2018-08-07 14:08:...|\n",
      "|11.985447360963212|POLYGON ((-24.993...|LC08_001006_20180823|2018-08-23 14:08:...|\n",
      "|11.987771678599081|POLYGON ((-22.364...|LC08_001006_20180908|2018-09-08 14:08:...|\n",
      "+------------------+--------------------+--------------------+--------------------+\n",
      "only showing top 20 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "l8_df_sedona.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "id": "consolidated-flavor",
   "metadata": {},
   "outputs": [],
   "source": [
    "s2_df_wkt.createOrReplaceTempView(\"s2_df_wkt\")\n",
    "s2_df_sedona = spark.sql(\"select ST_GeomFromWKT(WKT_geo) as s2_geometry, s2_index as s2_id, timestamp as s2_timestamp from s2_df_wkt\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "id": "stone-aging",
   "metadata": {},
   "outputs": [],
   "source": [
    "# calculate the area for the s2\n",
    "\n",
    "s2_df_sedona.createOrReplaceTempView(\"s2_df_sedona\")\n",
    "s2_df_sedona = spark.sql(\"select ST_Area(s2_geometry) as s2_area, * from s2_df_sedona\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "id": "engaged-qualification",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+--------------------+--------------------+--------------------+--------------------+\n",
      "|             s2_area|         s2_geometry|               s2_id|        s2_timestamp|\n",
      "+--------------------+--------------------+--------------------+--------------------+\n",
      "|  0.9353367184743338|POLYGON ((-6.5921...|20171226T094359_2...|2018-02-02 11:37:...|\n",
      "|    0.04975974010451|POLYGON ((33.1015...|20180101T082329_2...|2018-01-01 08:23:...|\n",
      "|0.028138156238690933|POLYGON ((34.1301...|20180101T082329_2...|2018-01-01 08:23:...|\n",
      "|  0.9896346515425708|POLYGON ((34.1465...|20180101T082329_2...|2018-01-01 08:23:...|\n",
      "|  0.8891776385157389|POLYGON ((34.1574...|20180101T082329_2...|2018-01-01 08:23:...|\n",
      "| 0.22045620916115177|POLYGON ((35.1594...|20180101T082329_2...|2018-01-01 08:23:...|\n",
      "|  1.1296895785175753|POLYGON ((35.1689...|20180101T082329_2...|2018-01-01 08:23:...|\n",
      "|  1.1403158483431393|POLYGON ((34.0543...|20180101T082329_2...|2018-01-01 08:23:...|\n",
      "|  0.3490307087263577|POLYGON ((35.0638...|20180101T082329_2...|2018-01-01 08:23:...|\n",
      "|  1.0561604589873097|POLYGON ((36.0089...|20180101T082329_2...|2018-01-01 08:23:...|\n",
      "|  1.1389331101552205|POLYGON ((35.1079...|20180101T082329_2...|2018-01-01 08:23:...|\n",
      "|  0.6674468416368683|POLYGON ((33.3628...|20180101T082329_2...|2018-01-01 08:23:...|\n",
      "|  0.4433850315383707|POLYGON ((34.1677...|20180101T082329_2...|2018-01-01 08:23:...|\n",
      "|   0.216942383203032|POLYGON ((34.1008...|20180101T082329_2...|2018-01-01 08:23:...|\n",
      "|  1.1511821013349017|POLYGON ((34.0647...|20180101T082329_2...|2018-01-01 08:23:...|\n",
      "|  1.1625541701612807|POLYGON ((34.0756...|20180101T082329_2...|2018-01-01 08:23:...|\n",
      "|  1.1741105497465945|POLYGON ((34.0746...|20180101T082329_2...|2018-01-01 08:23:...|\n",
      "|  1.0694739134405127|POLYGON ((35.3049...|20180101T082329_2...|2018-01-01 08:23:...|\n",
      "|  0.8494615591637208|POLYGON ((34.6001...|20180101T082329_2...|2018-01-01 08:23:...|\n",
      "|  0.6255690971259398|POLYGON ((35.3583...|20180101T082329_2...|2018-01-01 08:23:...|\n",
      "+--------------------+--------------------+--------------------+--------------------+\n",
      "only showing top 20 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "s2_df_sedona.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "lesbian-guard",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "id": "searching-secretary",
   "metadata": {},
   "outputs": [],
   "source": [
    "# pr_df_wkt.printSchema()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "id": "descending-relay",
   "metadata": {},
   "outputs": [],
   "source": [
    "# pr_df_wkt.createOrReplaceTempView(\"pr_df_wkt\")\n",
    "# pr_df_sedona = spark.sql(\"select ST_GeomFromWKT(WKT_geo) as pr_geometry, img_id as pr_id, timestep as pr_timestep from pr_df_wkt\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "id": "unable-series",
   "metadata": {},
   "outputs": [],
   "source": [
    "# pr_df_sedona.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "cloudy-subscriber",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "raised-envelope",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "id": "technical-perth",
   "metadata": {},
   "source": [
    "### prepare timestamps for temporal join"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "id": "related-going",
   "metadata": {},
   "outputs": [],
   "source": [
    "l8_df_sedona = l8_df_sedona.withColumn('l8_timestamp_date', F.to_timestamp(F.col('l8_timestep'), 'yyyy-MM-dd HH:mm:ss'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 50,
   "id": "executed-matrix",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+------------------+--------------------+--------------------+--------------------+-------------------+\n",
      "|           l8_area|         l8_geometry|           l8_img_id|         l8_timestep|  l8_timestamp_date|\n",
      "+------------------+--------------------+--------------------+--------------------+-------------------+\n",
      "|14.124304956531168|POLYGON ((-10.924...|LC08_001004_20180706|2018-07-06 14:07:...|2018-07-06 14:07:29|\n",
      "|  14.1368958808218|POLYGON ((-16.038...|LC08_001004_20180722|2018-07-22 14:07:...|2018-07-22 14:07:36|\n",
      "| 14.13397280878489|POLYGON ((-12.596...|LC08_001004_20180807|2018-08-07 14:07:...|2018-08-07 14:07:44|\n",
      "| 14.21783504836524|POLYGON ((-13.429...|LC08_001004_20180908|2018-09-08 14:07:...|2018-09-08 14:07:58|\n",
      "|13.013235111990365|POLYGON ((-19.204...|LC08_001005_20180417|2018-04-17 14:08:...|2018-04-17 14:08:04|\n",
      "|13.016815047842258|POLYGON ((-15.823...|LC08_001005_20180706|2018-07-06 14:07:...|2018-07-06 14:07:52|\n",
      "| 13.04422889786937|POLYGON ((-20.743...|LC08_001005_20180722|2018-07-22 14:07:...|2018-07-22 14:07:59|\n",
      "|13.025145367910945|POLYGON ((-25.727...|LC08_001005_20180807|2018-08-07 14:08:...|2018-08-07 14:08:08|\n",
      "|13.088991981998086|POLYGON ((-19.249...|LC08_001005_20180823|2018-08-23 14:08:...|2018-08-23 14:08:16|\n",
      "|13.036544425947993|POLYGON ((-19.547...|LC08_001005_20180908|2018-09-08 14:08:...|2018-09-08 14:08:22|\n",
      "|11.996375308726677|POLYGON ((-19.492...|LC08_001006_20180401|2018-04-01 14:08:...|2018-04-01 14:08:37|\n",
      "| 12.04368416424424|POLYGON ((-21.456...|LC08_001006_20180417|2018-04-17 14:08:...|2018-04-17 14:08:28|\n",
      "|11.994548598119355|POLYGON ((-29.341...|LC08_001006_20180519|2018-05-19 14:08:...|2018-05-19 14:08:08|\n",
      "|11.907405621268417|POLYGON ((-29.095...|LC08_001006_20180604|2018-06-04 14:07:...|2018-06-04 14:07:57|\n",
      "|12.011768123945448|POLYGON ((-22.339...|LC08_001006_20180620|2018-06-20 14:08:...|2018-06-20 14:08:07|\n",
      "|11.997177863781467|POLYGON ((-29.103...|LC08_001006_20180706|2018-07-06 14:08:...|2018-07-06 14:08:16|\n",
      "|12.036923004713366|POLYGON ((-29.090...|LC08_001006_20180722|2018-07-22 14:08:...|2018-07-22 14:08:23|\n",
      "|11.964717576376385|POLYGON ((-19.801...|LC08_001006_20180807|2018-08-07 14:08:...|2018-08-07 14:08:32|\n",
      "|11.985447360963212|POLYGON ((-24.993...|LC08_001006_20180823|2018-08-23 14:08:...|2018-08-23 14:08:40|\n",
      "|11.987771678599081|POLYGON ((-22.364...|LC08_001006_20180908|2018-09-08 14:08:...|2018-09-08 14:08:46|\n",
      "+------------------+--------------------+--------------------+--------------------+-------------------+\n",
      "only showing top 20 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "l8_df_sedona.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 51,
   "id": "literary-thermal",
   "metadata": {},
   "outputs": [],
   "source": [
    "s2_df_sedona = s2_df_sedona.withColumn('s2_timestamp_date', F.to_timestamp(F.col('s2_timestamp'), 'yyyy-MM-dd HH:mm:ss'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 52,
   "id": "adverse-trail",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+--------------------+--------------------+--------------------+--------------------+-------------------+\n",
      "|             s2_area|         s2_geometry|               s2_id|        s2_timestamp|  s2_timestamp_date|\n",
      "+--------------------+--------------------+--------------------+--------------------+-------------------+\n",
      "|  0.9353367184743338|POLYGON ((-6.5921...|20171226T094359_2...|2018-02-02 11:37:...|2018-02-02 11:37:55|\n",
      "|    0.04975974010451|POLYGON ((33.1015...|20180101T082329_2...|2018-01-01 08:23:...|2018-01-01 08:23:29|\n",
      "|0.028138156238690933|POLYGON ((34.1301...|20180101T082329_2...|2018-01-01 08:23:...|2018-01-01 08:23:29|\n",
      "|  0.9896346515425708|POLYGON ((34.1465...|20180101T082329_2...|2018-01-01 08:23:...|2018-01-01 08:23:29|\n",
      "|  0.8891776385157389|POLYGON ((34.1574...|20180101T082329_2...|2018-01-01 08:23:...|2018-01-01 08:23:29|\n",
      "| 0.22045620916115177|POLYGON ((35.1594...|20180101T082329_2...|2018-01-01 08:23:...|2018-01-01 08:23:29|\n",
      "|  1.1296895785175753|POLYGON ((35.1689...|20180101T082329_2...|2018-01-01 08:23:...|2018-01-01 08:23:29|\n",
      "|  1.1403158483431393|POLYGON ((34.0543...|20180101T082329_2...|2018-01-01 08:23:...|2018-01-01 08:23:29|\n",
      "|  0.3490307087263577|POLYGON ((35.0638...|20180101T082329_2...|2018-01-01 08:23:...|2018-01-01 08:23:29|\n",
      "|  1.0561604589873097|POLYGON ((36.0089...|20180101T082329_2...|2018-01-01 08:23:...|2018-01-01 08:23:29|\n",
      "|  1.1389331101552205|POLYGON ((35.1079...|20180101T082329_2...|2018-01-01 08:23:...|2018-01-01 08:23:29|\n",
      "|  0.6674468416368683|POLYGON ((33.3628...|20180101T082329_2...|2018-01-01 08:23:...|2018-01-01 08:23:29|\n",
      "|  0.4433850315383707|POLYGON ((34.1677...|20180101T082329_2...|2018-01-01 08:23:...|2018-01-01 08:23:29|\n",
      "|   0.216942383203032|POLYGON ((34.1008...|20180101T082329_2...|2018-01-01 08:23:...|2018-01-01 08:23:29|\n",
      "|  1.1511821013349017|POLYGON ((34.0647...|20180101T082329_2...|2018-01-01 08:23:...|2018-01-01 08:23:29|\n",
      "|  1.1625541701612807|POLYGON ((34.0756...|20180101T082329_2...|2018-01-01 08:23:...|2018-01-01 08:23:29|\n",
      "|  1.1741105497465945|POLYGON ((34.0746...|20180101T082329_2...|2018-01-01 08:23:...|2018-01-01 08:23:29|\n",
      "|  1.0694739134405127|POLYGON ((35.3049...|20180101T082329_2...|2018-01-01 08:23:...|2018-01-01 08:23:29|\n",
      "|  0.8494615591637208|POLYGON ((34.6001...|20180101T082329_2...|2018-01-01 08:23:...|2018-01-01 08:23:29|\n",
      "|  0.6255690971259398|POLYGON ((35.3583...|20180101T082329_2...|2018-01-01 08:23:...|2018-01-01 08:23:29|\n",
      "+--------------------+--------------------+--------------------+--------------------+-------------------+\n",
      "only showing top 20 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "s2_df_sedona.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 53,
   "id": "departmental-logistics",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Extract data within potential time slots\n",
    "import time\n",
    "import datetime\n",
    "\n",
    "start_timestamp = time.mktime(datetime.datetime.strptime(start_date, \"%Y-%m-%d\").timetuple())\n",
    "start_timestamp = start_timestamp - hour_delay * 3600 # for potential time delay\n",
    "\n",
    "end_timestamp = time.mktime(datetime.datetime.strptime(end_date, \"%Y-%m-%d\").timetuple())\n",
    "end_timestamp = end_timestamp + hour_delay * 3600 # for potential time delay"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 54,
   "id": "smoking-orientation",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "33308"
      ]
     },
     "execution_count": 54,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "l8_df_sedona = l8_df_sedona.filter(F.unix_timestamp(F.col('l8_timestep')) <= end_timestamp).filter(F.unix_timestamp(F.col('l8_timestep')) >= start_timestamp)\n",
    "l8_df_sedona.createOrReplaceTempView(\"l8_df_sedona\")\n",
    "\n",
    "## Show the schema of the table\n",
    "spark.table(\"l8_df_sedona\").count()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 55,
   "id": "complicated-attempt",
   "metadata": {},
   "outputs": [],
   "source": [
    "s2_df_sedona = s2_df_sedona.filter(F.unix_timestamp(F.col('s2_timestamp')) <= end_timestamp).filter(F.unix_timestamp(F.col('s2_timestamp')) >= start_timestamp)\n",
    "s2_df_sedona.createOrReplaceTempView(\"s2_df_sedona\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "1e7ae2bd-80c3-406f-ad8c-f138c665f24d",
   "metadata": {},
   "source": [
    "## the temporal slicing index is used in the STSI strategy, not using here"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 56,
   "id": "ignored-genome",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1514782800.0"
      ]
     },
     "execution_count": 56,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# try to generate a temporal index\n",
    "# this is used in the STSI strategy, not using here\n",
    "\n",
    "# find start timestamp\n",
    "import datetime\n",
    "import time\n",
    " \n",
    "# assigned regular string date\n",
    "start_date_datetype = datetime.datetime(2018, 1, 1, 0, 0)\n",
    "# print(\"date_time =>\",date_time)\n",
    " \n",
    "start_date_timestamp = time.mktime(start_date_datetype.timetuple())\n",
    "start_date_timestamp"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 57,
   "id": "twenty-plymouth",
   "metadata": {},
   "outputs": [],
   "source": [
    "# get time_duration and hour_delay into seconds for comparison\n",
    "time_index_hour_length_second = time_index_hour_length * 3600\n",
    "hour_delay_second = hour_delay * 3600"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 58,
   "id": "isolated-regression",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "86400"
      ]
     },
     "execution_count": 58,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "time_index_hour_length_second"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 59,
   "id": "sharing-booking",
   "metadata": {},
   "outputs": [],
   "source": [
    "sentinel_sedona_time_index = s2_df_sedona.withColumn('time_index', F.floor((F.unix_timestamp('s2_timestamp_date') - F.lit(start_date_timestamp)) / F.lit(time_index_hour_length_second)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 60,
   "id": "apparent-vienna",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+--------------------+--------------------+--------------------+--------------------+-------------------+----------+\n",
      "|             s2_area|         s2_geometry|               s2_id|        s2_timestamp|  s2_timestamp_date|time_index|\n",
      "+--------------------+--------------------+--------------------+--------------------+-------------------+----------+\n",
      "|  0.9353367184743338|POLYGON ((-6.5921...|20171226T094359_2...|2018-02-02 11:37:...|2018-02-02 11:37:55|        32|\n",
      "|    0.04975974010451|POLYGON ((33.1015...|20180101T082329_2...|2018-01-01 08:23:...|2018-01-01 08:23:29|         0|\n",
      "|0.028138156238690933|POLYGON ((34.1301...|20180101T082329_2...|2018-01-01 08:23:...|2018-01-01 08:23:29|         0|\n",
      "|  0.9896346515425708|POLYGON ((34.1465...|20180101T082329_2...|2018-01-01 08:23:...|2018-01-01 08:23:29|         0|\n",
      "|  0.8891776385157389|POLYGON ((34.1574...|20180101T082329_2...|2018-01-01 08:23:...|2018-01-01 08:23:29|         0|\n",
      "| 0.22045620916115177|POLYGON ((35.1594...|20180101T082329_2...|2018-01-01 08:23:...|2018-01-01 08:23:29|         0|\n",
      "|  1.1296895785175753|POLYGON ((35.1689...|20180101T082329_2...|2018-01-01 08:23:...|2018-01-01 08:23:29|         0|\n",
      "|  1.1403158483431393|POLYGON ((34.0543...|20180101T082329_2...|2018-01-01 08:23:...|2018-01-01 08:23:29|         0|\n",
      "|  0.3490307087263577|POLYGON ((35.0638...|20180101T082329_2...|2018-01-01 08:23:...|2018-01-01 08:23:29|         0|\n",
      "|  1.0561604589873097|POLYGON ((36.0089...|20180101T082329_2...|2018-01-01 08:23:...|2018-01-01 08:23:29|         0|\n",
      "|  1.1389331101552205|POLYGON ((35.1079...|20180101T082329_2...|2018-01-01 08:23:...|2018-01-01 08:23:29|         0|\n",
      "|  0.6674468416368683|POLYGON ((33.3628...|20180101T082329_2...|2018-01-01 08:23:...|2018-01-01 08:23:29|         0|\n",
      "|  0.4433850315383707|POLYGON ((34.1677...|20180101T082329_2...|2018-01-01 08:23:...|2018-01-01 08:23:29|         0|\n",
      "|   0.216942383203032|POLYGON ((34.1008...|20180101T082329_2...|2018-01-01 08:23:...|2018-01-01 08:23:29|         0|\n",
      "|  1.1511821013349017|POLYGON ((34.0647...|20180101T082329_2...|2018-01-01 08:23:...|2018-01-01 08:23:29|         0|\n",
      "|  1.1625541701612807|POLYGON ((34.0756...|20180101T082329_2...|2018-01-01 08:23:...|2018-01-01 08:23:29|         0|\n",
      "|  1.1741105497465945|POLYGON ((34.0746...|20180101T082329_2...|2018-01-01 08:23:...|2018-01-01 08:23:29|         0|\n",
      "|  1.0694739134405127|POLYGON ((35.3049...|20180101T082329_2...|2018-01-01 08:23:...|2018-01-01 08:23:29|         0|\n",
      "|  0.8494615591637208|POLYGON ((34.6001...|20180101T082329_2...|2018-01-01 08:23:...|2018-01-01 08:23:29|         0|\n",
      "|  0.6255690971259398|POLYGON ((35.3583...|20180101T082329_2...|2018-01-01 08:23:...|2018-01-01 08:23:29|         0|\n",
      "+--------------------+--------------------+--------------------+--------------------+-------------------+----------+\n",
      "only showing top 20 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "sentinel_sedona_time_index.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 61,
   "id": "growing-aluminum",
   "metadata": {},
   "outputs": [],
   "source": [
    "sentinel_sedona_time_index_early = sentinel_sedona_time_index.filter(((F.unix_timestamp('s2_timestamp_date') - F.lit(start_date_timestamp)) % F.lit(time_index_hour_length_second)) <= hour_delay_second)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 62,
   "id": "forward-greenhouse",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+-------+-----------+-----+------------+-----------------+----------+\n",
      "|s2_area|s2_geometry|s2_id|s2_timestamp|s2_timestamp_date|time_index|\n",
      "+-------+-----------+-----+------------+-----------------+----------+\n",
      "+-------+-----------+-----+------------+-----------------+----------+\n",
      "\n"
     ]
    }
   ],
   "source": [
    "sentinel_sedona_time_index_early.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "hairy-partner",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 63,
   "id": "ignored-assembly",
   "metadata": {},
   "outputs": [],
   "source": [
    "sentinel_sedona_time_index_late = sentinel_sedona_time_index.filter(((F.unix_timestamp('s2_timestamp_date') - F.lit(start_date_timestamp)) % F.lit(time_index_hour_length_second)) >= (time_index_hour_length_second - hour_delay_second))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 64,
   "id": "rocky-sport",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+-------+-----------+-----+------------+-----------------+----------+\n",
      "|s2_area|s2_geometry|s2_id|s2_timestamp|s2_timestamp_date|time_index|\n",
      "+-------+-----------+-----+------------+-----------------+----------+\n",
      "+-------+-----------+-----+------------+-----------------+----------+\n",
      "\n"
     ]
    }
   ],
   "source": [
    "sentinel_sedona_time_index_late.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "sapphire-arena",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 65,
   "id": "proud-fitness",
   "metadata": {},
   "outputs": [],
   "source": [
    "sentinel_sedona_dateforjoin = sentinel_sedona_time_index.withColumn('time_index_join', F.col('time_index')).union(sentinel_sedona_time_index_late.withColumn('time_index_join', (F.col('time_index') + 1))).union(sentinel_sedona_time_index_early.withColumn('time_index_join', (F.col('time_index') - 1)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 66,
   "id": "extreme-raleigh",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+--------------------+--------------------+--------------------+--------------------+-------------------+----------+---------------+\n",
      "|             s2_area|         s2_geometry|               s2_id|        s2_timestamp|  s2_timestamp_date|time_index|time_index_join|\n",
      "+--------------------+--------------------+--------------------+--------------------+-------------------+----------+---------------+\n",
      "|  0.9353367184743338|POLYGON ((-6.5921...|20171226T094359_2...|2018-02-02 11:37:...|2018-02-02 11:37:55|        32|             32|\n",
      "|    0.04975974010451|POLYGON ((33.1015...|20180101T082329_2...|2018-01-01 08:23:...|2018-01-01 08:23:29|         0|              0|\n",
      "|0.028138156238690933|POLYGON ((34.1301...|20180101T082329_2...|2018-01-01 08:23:...|2018-01-01 08:23:29|         0|              0|\n",
      "|  0.9896346515425708|POLYGON ((34.1465...|20180101T082329_2...|2018-01-01 08:23:...|2018-01-01 08:23:29|         0|              0|\n",
      "|  0.8891776385157389|POLYGON ((34.1574...|20180101T082329_2...|2018-01-01 08:23:...|2018-01-01 08:23:29|         0|              0|\n",
      "| 0.22045620916115177|POLYGON ((35.1594...|20180101T082329_2...|2018-01-01 08:23:...|2018-01-01 08:23:29|         0|              0|\n",
      "|  1.1296895785175753|POLYGON ((35.1689...|20180101T082329_2...|2018-01-01 08:23:...|2018-01-01 08:23:29|         0|              0|\n",
      "|  1.1403158483431393|POLYGON ((34.0543...|20180101T082329_2...|2018-01-01 08:23:...|2018-01-01 08:23:29|         0|              0|\n",
      "|  0.3490307087263577|POLYGON ((35.0638...|20180101T082329_2...|2018-01-01 08:23:...|2018-01-01 08:23:29|         0|              0|\n",
      "|  1.0561604589873097|POLYGON ((36.0089...|20180101T082329_2...|2018-01-01 08:23:...|2018-01-01 08:23:29|         0|              0|\n",
      "|  1.1389331101552205|POLYGON ((35.1079...|20180101T082329_2...|2018-01-01 08:23:...|2018-01-01 08:23:29|         0|              0|\n",
      "|  0.6674468416368683|POLYGON ((33.3628...|20180101T082329_2...|2018-01-01 08:23:...|2018-01-01 08:23:29|         0|              0|\n",
      "|  0.4433850315383707|POLYGON ((34.1677...|20180101T082329_2...|2018-01-01 08:23:...|2018-01-01 08:23:29|         0|              0|\n",
      "|   0.216942383203032|POLYGON ((34.1008...|20180101T082329_2...|2018-01-01 08:23:...|2018-01-01 08:23:29|         0|              0|\n",
      "|  1.1511821013349017|POLYGON ((34.0647...|20180101T082329_2...|2018-01-01 08:23:...|2018-01-01 08:23:29|         0|              0|\n",
      "|  1.1625541701612807|POLYGON ((34.0756...|20180101T082329_2...|2018-01-01 08:23:...|2018-01-01 08:23:29|         0|              0|\n",
      "|  1.1741105497465945|POLYGON ((34.0746...|20180101T082329_2...|2018-01-01 08:23:...|2018-01-01 08:23:29|         0|              0|\n",
      "|  1.0694739134405127|POLYGON ((35.3049...|20180101T082329_2...|2018-01-01 08:23:...|2018-01-01 08:23:29|         0|              0|\n",
      "|  0.8494615591637208|POLYGON ((34.6001...|20180101T082329_2...|2018-01-01 08:23:...|2018-01-01 08:23:29|         0|              0|\n",
      "|  0.6255690971259398|POLYGON ((35.3583...|20180101T082329_2...|2018-01-01 08:23:...|2018-01-01 08:23:29|         0|              0|\n",
      "+--------------------+--------------------+--------------------+--------------------+-------------------+----------+---------------+\n",
      "only showing top 20 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "sentinel_sedona_dateforjoin.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 67,
   "id": "exterior-incident",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "69679"
      ]
     },
     "execution_count": 67,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "sentinel_sedona_dateforjoin.count()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 68,
   "id": "hispanic-agent",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "69679"
      ]
     },
     "execution_count": 68,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "sentinel_sedona_dateforjoin.drop_duplicates(subset=['s2_id']).count()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "biblical-academy",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "italian-george",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "regular-charity",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "dietary-worship",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "equipped-processor",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "id": "graduate-upper",
   "metadata": {},
   "source": [
    "## Third, spatio-temporal join Sentinel dataset and L8 dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 69,
   "id": "living-relevance",
   "metadata": {},
   "outputs": [],
   "source": [
    "# 3.1 join the two Sedona dataframe according the timestamps, hour level at now, consider time delay"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 70,
   "id": "geographic-saint",
   "metadata": {},
   "outputs": [],
   "source": [
    "sentinel_sedona_dateforjoin_maxmin = sentinel_sedona_dateforjoin.withColumn('sentinel_max_timestamp', (F.unix_timestamp(\"s2_timestamp_date\") + hour_delay * 3600)).\\\n",
    "withColumn('sentinel_min_timestamp', (F.unix_timestamp(\"s2_timestamp_date\") - hour_delay * 3600))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 71,
   "id": "proprietary-official",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+--------------------+--------------------+--------------------+--------------------+-------------------+----------+---------------+----------------------+----------------------+\n",
      "|             s2_area|         s2_geometry|               s2_id|        s2_timestamp|  s2_timestamp_date|time_index|time_index_join|sentinel_max_timestamp|sentinel_min_timestamp|\n",
      "+--------------------+--------------------+--------------------+--------------------+-------------------+----------+---------------+----------------------+----------------------+\n",
      "|  0.9353367184743338|POLYGON ((-6.5921...|20171226T094359_2...|2018-02-02 11:37:...|2018-02-02 11:37:55|        32|             32|            1517611075|            1517567875|\n",
      "|    0.04975974010451|POLYGON ((33.1015...|20180101T082329_2...|2018-01-01 08:23:...|2018-01-01 08:23:29|         0|              0|            1514834609|            1514791409|\n",
      "|0.028138156238690933|POLYGON ((34.1301...|20180101T082329_2...|2018-01-01 08:23:...|2018-01-01 08:23:29|         0|              0|            1514834609|            1514791409|\n",
      "|  0.9896346515425708|POLYGON ((34.1465...|20180101T082329_2...|2018-01-01 08:23:...|2018-01-01 08:23:29|         0|              0|            1514834609|            1514791409|\n",
      "|  0.8891776385157389|POLYGON ((34.1574...|20180101T082329_2...|2018-01-01 08:23:...|2018-01-01 08:23:29|         0|              0|            1514834609|            1514791409|\n",
      "| 0.22045620916115177|POLYGON ((35.1594...|20180101T082329_2...|2018-01-01 08:23:...|2018-01-01 08:23:29|         0|              0|            1514834609|            1514791409|\n",
      "|  1.1296895785175753|POLYGON ((35.1689...|20180101T082329_2...|2018-01-01 08:23:...|2018-01-01 08:23:29|         0|              0|            1514834609|            1514791409|\n",
      "|  1.1403158483431393|POLYGON ((34.0543...|20180101T082329_2...|2018-01-01 08:23:...|2018-01-01 08:23:29|         0|              0|            1514834609|            1514791409|\n",
      "|  0.3490307087263577|POLYGON ((35.0638...|20180101T082329_2...|2018-01-01 08:23:...|2018-01-01 08:23:29|         0|              0|            1514834609|            1514791409|\n",
      "|  1.0561604589873097|POLYGON ((36.0089...|20180101T082329_2...|2018-01-01 08:23:...|2018-01-01 08:23:29|         0|              0|            1514834609|            1514791409|\n",
      "|  1.1389331101552205|POLYGON ((35.1079...|20180101T082329_2...|2018-01-01 08:23:...|2018-01-01 08:23:29|         0|              0|            1514834609|            1514791409|\n",
      "|  0.6674468416368683|POLYGON ((33.3628...|20180101T082329_2...|2018-01-01 08:23:...|2018-01-01 08:23:29|         0|              0|            1514834609|            1514791409|\n",
      "|  0.4433850315383707|POLYGON ((34.1677...|20180101T082329_2...|2018-01-01 08:23:...|2018-01-01 08:23:29|         0|              0|            1514834609|            1514791409|\n",
      "|   0.216942383203032|POLYGON ((34.1008...|20180101T082329_2...|2018-01-01 08:23:...|2018-01-01 08:23:29|         0|              0|            1514834609|            1514791409|\n",
      "|  1.1511821013349017|POLYGON ((34.0647...|20180101T082329_2...|2018-01-01 08:23:...|2018-01-01 08:23:29|         0|              0|            1514834609|            1514791409|\n",
      "|  1.1625541701612807|POLYGON ((34.0756...|20180101T082329_2...|2018-01-01 08:23:...|2018-01-01 08:23:29|         0|              0|            1514834609|            1514791409|\n",
      "|  1.1741105497465945|POLYGON ((34.0746...|20180101T082329_2...|2018-01-01 08:23:...|2018-01-01 08:23:29|         0|              0|            1514834609|            1514791409|\n",
      "|  1.0694739134405127|POLYGON ((35.3049...|20180101T082329_2...|2018-01-01 08:23:...|2018-01-01 08:23:29|         0|              0|            1514834609|            1514791409|\n",
      "|  0.8494615591637208|POLYGON ((34.6001...|20180101T082329_2...|2018-01-01 08:23:...|2018-01-01 08:23:29|         0|              0|            1514834609|            1514791409|\n",
      "|  0.6255690971259398|POLYGON ((35.3583...|20180101T082329_2...|2018-01-01 08:23:...|2018-01-01 08:23:29|         0|              0|            1514834609|            1514791409|\n",
      "+--------------------+--------------------+--------------------+--------------------+-------------------+----------+---------------+----------------------+----------------------+\n",
      "only showing top 20 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "sentinel_sedona_dateforjoin_maxmin.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "heated-verse",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 72,
   "id": "industrial-damage",
   "metadata": {},
   "outputs": [],
   "source": [
    "l8_df_sedona = l8_df_sedona.withColumn('time_index_join', F.floor((F.unix_timestamp('l8_timestamp_date') - F.lit(start_date_timestamp)) / F.lit(time_index_hour_length_second)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 73,
   "id": "rising-slide",
   "metadata": {},
   "outputs": [],
   "source": [
    "l8_df_sedona = l8_df_sedona.withColumn('l8_timestamp', F.unix_timestamp(F.col('l8_timestamp_date')))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 74,
   "id": "renewable-terrace",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "root\n",
      " |-- l8_area: double (nullable = false)\n",
      " |-- l8_geometry: geometry (nullable = false)\n",
      " |-- l8_img_id: string (nullable = true)\n",
      " |-- l8_timestep: timestamp (nullable = true)\n",
      " |-- l8_timestamp_date: timestamp (nullable = true)\n",
      " |-- time_index_join: long (nullable = true)\n",
      " |-- l8_timestamp: long (nullable = true)\n",
      "\n"
     ]
    }
   ],
   "source": [
    "l8_df_sedona.printSchema()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 75,
   "id": "tender-error",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "root\n",
      " |-- s2_area: double (nullable = false)\n",
      " |-- s2_geometry: geometry (nullable = false)\n",
      " |-- s2_id: string (nullable = true)\n",
      " |-- s2_timestamp: timestamp (nullable = true)\n",
      " |-- s2_timestamp_date: timestamp (nullable = true)\n",
      " |-- time_index: long (nullable = true)\n",
      " |-- time_index_join: long (nullable = true)\n",
      " |-- sentinel_max_timestamp: long (nullable = true)\n",
      " |-- sentinel_min_timestamp: long (nullable = true)\n",
      "\n"
     ]
    }
   ],
   "source": [
    "sentinel_sedona_dateforjoin_maxmin.printSchema()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 76,
   "id": "infectious-economy",
   "metadata": {},
   "outputs": [],
   "source": [
    "sentinel_sedona_dateforjoin_maxmin.createOrReplaceTempView(\"sentinel_sedona_dateforjoin_maxmin\")\n",
    "l8_df_sedona.createOrReplaceTempView(\"l8_df_sedona\")\n",
    "test_temporal_indexed_spatial_join = spark.sql(\"SELECT * FROM sentinel_sedona_dateforjoin_maxmin, l8_df_sedona WHERE ST_Intersects(sentinel_sedona_dateforjoin_maxmin.s2_geometry, l8_df_sedona.l8_geometry) AND (sentinel_max_timestamp >= l8_timestamp) AND (sentinel_min_timestamp <= l8_timestamp)\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 77,
   "id": "golden-gateway",
   "metadata": {},
   "outputs": [],
   "source": [
    "# print(5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "noticed-strip",
   "metadata": {},
   "outputs": [],
   "source": [
    "# # to gpd for gpd base-line\n",
    "# gpd.GeoDataFrame(l8_df_sedona.toPandas()[['l8_geometry', 'l8_img_id', 'l8_timestamp']], geometry='l8_geometry').to_file('L8_1801-03_all.shp')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "emotional-montreal",
   "metadata": {},
   "outputs": [],
   "source": [
    "# gpd.GeoDataFrame(\\\n",
    "#                  sentinel_sedona_dateforjoin_maxmin.toPandas()\\\n",
    "#                  [['s2_geometry', 's2_id', 'sentinel_max_timestamp', 'sentinel_min_timestamp']]\\\n",
    "#                  .rename({'sentinel_max_timestamp': 'S2_T_max', 'sentinel_min_timestamp': 'S2_T_min'}, axis=1), geometry='s2_geometry')\\\n",
    "# .to_file('S2_1801-03_all.shp')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "centered-identity",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 80,
   "id": "empirical-ireland",
   "metadata": {},
   "outputs": [],
   "source": [
    "# test_temporal_indexed_spatial_join.explain()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 81,
   "id": "alive-while",
   "metadata": {},
   "outputs": [],
   "source": [
    "# import time\n",
    "# start_spatial_time = time.time()\n",
    "# test_spatial_join.select('uuid', 'Description', 'is_timestamp').write.format('com.databricks.spark.csv')\\\n",
    "#         .mode(\"overwrite\")\\\n",
    "#         .option('header', True)\\\n",
    "#         .save('sentinel_is2_intersect_pure_spatial')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 82,
   "id": "strategic-instruction",
   "metadata": {},
   "outputs": [],
   "source": [
    "# with open('speed_pure_spatial_join_month.txt', 'w') as f:\n",
    "#     f.write('pure spatial join time:')\n",
    "#     f.write(str(time.time() - start_spatial_time))\n",
    "#     f.write('/n')\n",
    "#     f.write('number of spatial intersections:')\n",
    "#     f.write(str(test_spatial_join.count()))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 83,
   "id": "velvet-theater",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "root\n",
      " |-- s2_area: double (nullable = false)\n",
      " |-- s2_geometry: geometry (nullable = false)\n",
      " |-- s2_id: string (nullable = true)\n",
      " |-- s2_timestamp: timestamp (nullable = true)\n",
      " |-- s2_timestamp_date: timestamp (nullable = true)\n",
      " |-- time_index: long (nullable = true)\n",
      " |-- time_index_join: long (nullable = true)\n",
      " |-- sentinel_max_timestamp: long (nullable = true)\n",
      " |-- sentinel_min_timestamp: long (nullable = true)\n",
      " |-- l8_area: double (nullable = false)\n",
      " |-- l8_geometry: geometry (nullable = false)\n",
      " |-- l8_img_id: string (nullable = true)\n",
      " |-- l8_timestep: timestamp (nullable = true)\n",
      " |-- l8_timestamp_date: timestamp (nullable = true)\n",
      " |-- time_index_join: long (nullable = true)\n",
      " |-- l8_timestamp: long (nullable = true)\n",
      "\n"
     ]
    }
   ],
   "source": [
    "test_temporal_indexed_spatial_join.printSchema()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 84,
   "id": "instant-distinction",
   "metadata": {},
   "outputs": [],
   "source": [
    "# test_ST_join = test_temporal_indexed_spatial_join.filter(F.col('sentinel_max_timestamp') >= F.col('l8_timestamp')).filter(F.col('sentinel_min_timestamp') <= F.col('l8_timestamp'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "about-wealth",
   "metadata": {},
   "outputs": [],
   "source": [
    "# test_temporal_indexed_spatial_join.explain()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 86,
   "id": "agricultural-handling",
   "metadata": {},
   "outputs": [],
   "source": [
    "import time\n",
    "start_temporal_time = time.time()\n",
    "test_temporal_indexed_spatial_join.select('s2_id', 'l8_img_id', 's2_timestamp', 'l8_timestamp').write.format('com.databricks.spark.csv')\\\n",
    "        .mode(\"overwrite\")\\\n",
    "        .option('header', True)\\\n",
    "        .save('sentinel_L8_intersect_spatial_temporal')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "forward-prefix",
   "metadata": {},
   "outputs": [
    {
     "ename": "NameError",
     "evalue": "name 'duration_month' is not defined",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mNameError\u001b[0m                                 Traceback (most recent call last)",
      "Cell \u001b[0;32mIn[4], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m \u001b[38;5;28mopen\u001b[39m(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mICESpark_SSAT_\u001b[39m\u001b[38;5;132;01m{}\u001b[39;00m\u001b[38;5;124m_months_\u001b[39m\u001b[38;5;132;01m{}\u001b[39;00m\u001b[38;5;124m_hour_delay_S2L8.txt\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;241m.\u001b[39mformat(\u001b[38;5;28mstr\u001b[39m(duration_month), \u001b[38;5;28mstr\u001b[39m(hour_delay)), \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mw\u001b[39m\u001b[38;5;124m'\u001b[39m) \u001b[38;5;28;01mas\u001b[39;00m f:\n\u001b[1;32m      2\u001b[0m     f\u001b[38;5;241m.\u001b[39mwrite(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mS-SAT time in total:\u001b[39m\u001b[38;5;124m'\u001b[39m)\n\u001b[1;32m      3\u001b[0m     f\u001b[38;5;241m.\u001b[39mwrite(\u001b[38;5;28mstr\u001b[39m(time\u001b[38;5;241m.\u001b[39mtime() \u001b[38;5;241m-\u001b[39m start_temporal_time))\n",
      "\u001b[0;31mNameError\u001b[0m: name 'duration_month' is not defined"
     ]
    }
   ],
   "source": [
    "with open('ICESpark_SSAT_{}_months_{}_hour_delay_S2L8.txt'.format(str(duration_month), str(hour_delay)), 'w') as f:\n",
    "    f.write('S-SAT time in total:')\n",
    "    f.write(str(time.time() - start_temporal_time))\n",
    "    f.write('\\n')\n",
    "#     f.write('number of spatial intersections:')\n",
    "#     f.write(str(test_temporal_indexed_spatial_join.count()))\n",
    "    f.write('\\n')\n",
    "    f.write('from and to date')\n",
    "    f.write(start_date)\n",
    "    f.write('\\n')\n",
    "    f.write(end_date)\n",
    "    f.write('\\n')\n",
    "    f.write('L8 data count:')\n",
    "    f.write(str(l8_df_sedona.count()))\n",
    "    f.write('\\n')\n",
    "\n",
    "    f.write('Sentinel 2 data count:')\n",
    "    f.write(str(s2_df_sedona.count()))\n",
    "    f.write('\\n')\n",
    "    f.write('hour_delay:')\n",
    "    f.write(str(hour_delay))\n",
    "    \n",
    "    "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 88,
   "id": "threatened-render",
   "metadata": {},
   "outputs": [],
   "source": [
    "# test_ST_join.select('s2_index', 'Description', 'is_timestamp').drop_duplicates(subset=['s2_index', 'Description']).count()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "atmospheric-range",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 89,
   "id": "academic-water",
   "metadata": {},
   "outputs": [],
   "source": [
    "# \"fhldsai{}_{}\".format(str(2), 'se')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 90,
   "id": "incident-toddler",
   "metadata": {},
   "outputs": [],
   "source": [
    "# boundaries in the format of polygon）"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 91,
   "id": "endless-vector",
   "metadata": {},
   "outputs": [],
   "source": [
    "# test_ST_join_pd = test_ST_join.toPandas()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 92,
   "id": "precise-mobile",
   "metadata": {},
   "outputs": [
    {
     "ename": "NameError",
     "evalue": "name 'test_ST_join_pd' is not defined",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mNameError\u001b[0m                                 Traceback (most recent call last)",
      "\u001b[0;32m<ipython-input-92-b2e4339079fe>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mtest_ST_join_pd\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
      "\u001b[0;31mNameError\u001b[0m: name 'test_ST_join_pd' is not defined"
     ]
    }
   ],
   "source": [
    "test_ST_join_pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "elder-grill",
   "metadata": {},
   "outputs": [],
   "source": [
    "# gpd.GeoSeries(test_ST_join_area_pd['geometry']).buffer(0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "extended-collect",
   "metadata": {},
   "outputs": [],
   "source": [
    "test_ST_join_pd['intersection'] = gpd.GeoSeries(test_ST_join_pd['s2_geometry']).buffer(0).intersection(gpd.GeoSeries(test_ST_join_pd['l8_geometry']).buffer(0), align=False)\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "basic-mentor",
   "metadata": {},
   "outputs": [],
   "source": [
    "test_ST_join_pd['intersection_area'] = gpd.GeoSeries(test_ST_join_pd['intersection']).area\n",
    "test_ST_join_pd['s2_pd_area'] = gpd.GeoSeries(test_ST_join_pd['s2_geometry']).area\n",
    "test_ST_join_pd['l8_pd_area'] = gpd.GeoSeries(test_ST_join_pd['l8_geometry']).area"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "public-inventory",
   "metadata": {},
   "outputs": [],
   "source": [
    "test_ST_join_pd['intersection_percentage'] = test_ST_join_pd['intersection_area'] / test_ST_join_pd[['s2_pd_area', 's2_pd_area']].min(axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "cognitive-enhancement",
   "metadata": {},
   "outputs": [],
   "source": [
    "test_ST_join_pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "congressional-ethnic",
   "metadata": {},
   "outputs": [],
   "source": [
    "test_ST_join_pd.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "fundamental-insured",
   "metadata": {},
   "outputs": [],
   "source": [
    "test_ST_join_pd[test_ST_join_pd['intersection_percentage'] > 0.5].shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "clear-tomorrow",
   "metadata": {},
   "outputs": [],
   "source": [
    "78130 / 364922"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "fresh-treat",
   "metadata": {},
   "outputs": [],
   "source": [
    "# s2_l8_ST_join_valid_true_pd[s2_l8_ST_join_valid_true_pd['intersection_area'] / s2_l8_ST_join_valid_true_pd['s2_pd_area'] > 0.9]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "streaming-catch",
   "metadata": {},
   "outputs": [],
   "source": [
    "# test_area_intersect = s2_l8_ST_join_valid_true_pd[s2_l8_ST_join_valid_true_pd['intersection_area'] / s2_l8_ST_join_valid_true_pd['s2_pd_area'] > 0.9]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "iraqi-maria",
   "metadata": {},
   "outputs": [],
   "source": [
    "# with open('ICESpark_temporal_indexed_join_{}_months_{}_hour_delay_S2L8.txt'.format(str(duration_month), str(hour_delay)), 'a+') as f:\n",
    "#     f.write('percentage of intersection area larger than 50% of the smaller RS image')\n",
    "#     f.write('\\n')\n",
    "#     f.write(str(test_ST_join_pd[test_ST_join_pd['intersection_percentage'] > 0.5].shape[0] / test_ST_join_pd.shape[0]))\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "sustainable-coordination",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
